# -*-coding:utf-8 -*-

from HTMLParser import HTMLParser
from re import sub
from sys import stderr
from traceback import print_exc


class _DeHTMLParser(HTMLParser):
    """HTML parser that gathers a document's text as plain-text fragments.

    Text nodes are whitespace-normalised and stored with a trailing space;
    ``<p>`` and ``<br>`` tags insert newlines.  Call ``text()`` after feeding
    the parser to obtain the joined result.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Output fragments in document order; joined by text().
        self.__pieces = []

    def handle_data(self, data):
        """Store a text node, collapsing each whitespace run to one space."""
        stripped = data.strip()
        if not stripped:
            # Whitespace-only nodes contribute nothing.
            return
        collapsed = sub('[ \t\r\n]+', ' ', stripped)
        self.__pieces.append(collapsed + ' ')

    def handle_starttag(self, tag, attrs):
        """Emit a blank line for ``<p>`` and a single newline for ``<br>``."""
        separator = {'p': '\n\n', 'br': '\n'}.get(tag)
        if separator is not None:
            self.__pieces.append(separator)

    def handle_startendtag(self, tag, attrs):
        """Emit two newlines for a self-closing ``<br/>``.

        Note that this differs from the single newline produced for ``<br>``.
        """
        if tag != 'br':
            return
        self.__pieces.append('\n\n')

    def text(self):
        """Return all collected fragments joined, without outer whitespace."""
        joined = ''.join(self.__pieces)
        return joined.strip()


def dehtml(textPath):
    """Return the plain-text rendering of the HTML file at ``textPath``.

    The file is read in text mode and its content is fed through
    ``_DeHTMLParser``.

    Args:
        textPath: Path of the HTML file to read.

    Returns:
        The text extracted by the parser.  If parsing raises an exception,
        the traceback is printed to stderr and the raw file content is
        returned unchanged.

    Errors raised while opening or reading the file are not handled here and
    propagate to the caller.
    """
    # Use a context manager so the handle is closed even if read() fails.
    with open(textPath, 'r') as f:
        text = f.read()
    try:
        parser = _DeHTMLParser()
        parser.feed(text)
        parser.close()
        return parser.text()
    except Exception:
        # Best-effort fallback: report the failure and hand back the raw
        # content.  KeyboardInterrupt/SystemExit are intentionally not caught.
        print_exc(file=stderr)
        return text

# def WriteHtml():
#     f = open('E:\programs\\drop_test.html', 'r')
#     f.read()
#     f.close()


# def main():
#     f = open('E:\programs\\t1.html', 'r')
#     text = f.read()  # read the HTML content
#     print(dehtml(text))
#     f.close()

# if __name__ == '__main__':
#     main()
